import json
import os
# Merge the annotation files of several dataset subfolders into one JSON file.
#
# Each subfolder under ``data_folder`` is expected to hold a ``data.json``
# with a top-level "annotations" list whose entries carry an "image_id" key.
# In the merged output every "image_id" is rewritten to
# "<subfolder>/image/<original id>"; all other fields are copied unchanged.
data_folder = "./blob_dir/debug_output/llava/data"
minigpt4_data = [
    "TaxCocoVizTextCapFilterV3GPTVDetail0713",
    "TaxCocoVizTextCapFilterV3GPTVInstruct0713",
    "TaxCocoVizTextCapFilterV3GPTVNegInstruct0713",
]

data_combined = {"annotations": []}
# Collects the original (un-prefixed) image ids, so an id that appears in
# several subfolders is counted once in the statistics below.
image_ids = set()
for data_subfolder in minigpt4_data:
    data_file = f"{data_folder}/{data_subfolder}/data.json"
    # Context manager guarantees the input handle is closed after parsing.
    with open(data_file, "r", encoding="utf-8") as data_fp:
        data = json.load(data_fp)
    for item in data["annotations"]:
        # Shallow copy so the loaded annotation itself is left untouched.
        new_item = dict(item)
        new_item["image_id"] = f'{data_subfolder}/image/' + item['image_id']
        image_ids.add(item['image_id'])
        data_combined["annotations"].append(new_item)

num_pairs = len(data_combined["annotations"])
num_images = len(image_ids)
# Guard against empty input: without it the division raises
# ZeroDivisionError and the combined file would never be written.
avg_per_image = num_pairs / num_images if num_images else 0.0

print(f"Number of instruction-response pairs: {num_pairs}")
print(f"Number of images: {num_images}")
print(f"Average #instruction/image: {avg_per_image}")

output_file = f"{data_folder}/minigpt4_TaxCocoVizTextCapFilterV3GPTVDetail_Instruct_Neg0713.json"
# Context manager flushes and closes the output file deterministically.
with open(output_file, "w", encoding="utf-8") as out_fp:
    json.dump(data_combined, out_fp)